/**
 * Copyright (c) 2025 Huawei Technologies Co., Ltd.
 * This file is a part of the CANN Open Software.
 * Licensed under CANN Open Software License Agreement Version 1.0 (the
 * "License"). Please refer to the License for details. You may not use this
 * file except in compliance with the License. THIS SOFTWARE IS PROVIDED ON AN
 * "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
 * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS
 * FOR A PARTICULAR PURPOSE. See LICENSE in the root of the software repository
 * for the full text of the License.
 */

#ifndef EXAMPLES_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_CUSTOM_H
#define EXAMPLES_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_CUSTOM_H
#include "kernel_operator.h"


namespace MyCustomKernel {

// Number of float elements in each block's slice of x, y and z; also used to
// size the per-queue buffers (INIT_SIZE * sizeof(float) bytes).
constexpr int32_t INIT_SIZE = 256;

/**
 * Sample kernel: element-wise float add whose output slice is pre-filled in
 * global memory before the result is written.
 *
 * Every block addresses its own INIT_SIZE-element slice of x, y and z,
 * selected by AscendC::GetBlockIdx(). Init() calls AscendC::InitGlobalMemory
 * on the z slice with the block index (converted to float) as the value, and
 * Process() then runs one CopyIn -> Compute -> CopyOut pass that adds x and y
 * and copies the sum to z while atomic add is enabled.
 */
class KernelInitGlobalMemory {
public:
    __aicore__ inline KernelInitGlobalMemory() {}

    /**
     * Binds the global tensors to this block's slices, initialises the z
     * slice, and allocates one buffer per queue.
     *
     * @param x global-memory address of the first input (read as float)
     * @param y global-memory address of the second input (read as float)
     * @param z global-memory address of the output (written as float)
     */
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z)
    {
        // Each block starts INIT_SIZE elements after the previous one.
        const auto blockIdx = AscendC::GetBlockIdx();
        const auto blockOffset = INIT_SIZE * blockIdx;

        xGm.SetGlobalBuffer((__gm__ float*)x + blockOffset, INIT_SIZE);
        yGm.SetGlobalBuffer((__gm__ float*)y + blockOffset, INIT_SIZE);
        zGm.SetGlobalBuffer((__gm__ float*)z + blockOffset, INIT_SIZE);

        // Initialise the z slice; the value passed is the block index.
        AscendC::InitGlobalMemory(zGm, INIT_SIZE, static_cast<float>(blockIdx));

        // A synchronisation is required here so that MTE2 waits for MTE3:
        // set and immediately wait on an MTE3_MTE2 event.
        AscendC::TEventID mte3ToMte2Event = GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_MTE2);
        AscendC::SetFlag<AscendC::HardEvent::MTE3_MTE2>(mte3ToMte2Event);
        AscendC::WaitFlag<AscendC::HardEvent::MTE3_MTE2>(mte3ToMte2Event);

        // One buffer of INIT_SIZE floats for each queue.
        constexpr auto bufferBytes = INIT_SIZE * sizeof(float);
        pipe.InitBuffer(inQueueX, 1, bufferBytes);
        pipe.InitBuffer(inQueueY, 1, bufferBytes);
        pipe.InitBuffer(outQueueZ, 1, bufferBytes);
    }

    /**
     * Runs the three pipeline stages once, in order: load inputs, add,
     * store the result.
     */
    __aicore__ inline void Process()
    {
        CopyIn();
        Compute();
        CopyOut();
    }

private:
    /** Copies this block's x and y slices into local tensors and enqueues them. */
    __aicore__ inline void CopyIn()
    {
        AscendC::LocalTensor<float> xTile = inQueueX.AllocTensor<float>();
        AscendC::LocalTensor<float> yTile = inQueueY.AllocTensor<float>();
        AscendC::DataCopy(xTile, xGm, INIT_SIZE);
        AscendC::DataCopy(yTile, yGm, INIT_SIZE);
        inQueueX.EnQue(xTile);
        inQueueY.EnQue(yTile);
    }

    /** Dequeues both inputs, adds them element-wise and enqueues the sum. */
    __aicore__ inline void Compute()
    {
        AscendC::LocalTensor<float> xTile = inQueueX.DeQue<float>();
        AscendC::LocalTensor<float> yTile = inQueueY.DeQue<float>();
        AscendC::LocalTensor<float> sumTile = outQueueZ.AllocTensor<float>();
        AscendC::Add(sumTile, xTile, yTile, INIT_SIZE);
        outQueueZ.EnQue<float>(sumTile);
        // The input tensors are no longer needed once the sum is queued.
        inQueueX.FreeTensor(xTile);
        inQueueY.FreeTensor(yTile);
    }

    /** Dequeues the sum and copies it to the z slice with atomic add enabled. */
    __aicore__ inline void CopyOut()
    {
        AscendC::LocalTensor<float> sumTile = outQueueZ.DeQue<float>();
        // Add the result to zGm: atomic add is switched on for this copy
        // only and switched off again straight afterwards.
        AscendC::SetAtomicAdd<float>();
        AscendC::DataCopy(zGm, sumTile, INIT_SIZE);
        AscendC::SetAtomicNone();
        outQueueZ.FreeTensor(sumTile);
    }

    // Input queues for x and y, output queue for the sum.
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX;
    AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueY;
    AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueZ;
    // Per-block views of the three global buffers.
    AscendC::GlobalTensor<float> xGm;
    AscendC::GlobalTensor<float> yGm;
    AscendC::GlobalTensor<float> zGm;
    // Pipe used to allocate the queue buffers.
    AscendC::TPipe pipe;
};

} // namespace MyCustomKernel

#endif // EXAMPLES_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_CUSTOM_H
